{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW_5特征融合"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "import pickle as cPickle\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "from numpy.random import random  \n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RecommonderSystem:\n",
    "    \"\"\"Bundle of precomputed recommendation signals used as classifier features.\n",
    "\n",
    "    Constructing an instance loads pickled lookup tables and Matrix Market\n",
    "    matrices from the current working directory. Every other method returns\n",
    "    one feature for a user, an event, or a (user, event) pair.\n",
    "    \"\"\"\n",
    "\n",
    "    @staticmethod\n",
    "    def _load_pickle(path):\n",
    "        \"\"\"Unpickle the file at `path`, closing the handle before returning.\n",
    "\n",
    "        NOTE(security): unpickling can execute arbitrary code, so this must\n",
    "        only be pointed at trusted files.\n",
    "        \"\"\"\n",
    "        with open(path, 'rb') as fh:\n",
    "            return cPickle.load(fh)\n",
    "\n",
    "    def __init__(self):\n",
    "        \"\"\"Load every index, matrix and cached CF result from disk.\"\"\"\n",
    "        # New indices for users and events (id -> matrix position)\n",
    "        self.userIndex = self._load_pickle(\"PE_userIndex.pkl\")\n",
    "        self.eventIndex = self._load_pickle(\"PE_eventIndex.pkl\")\n",
    "        self.n_users = len(self.userIndex)\n",
    "        self.n_items = len(self.eventIndex)\n",
    "\n",
    "        # User-event relation matrix R.\n",
    "        # train_SVD re-reads it from file because the two need different\n",
    "        # formats; there was no time to unify them.\n",
    "        self.userEventScores = sio.mmread(\"PE_userEventScores\").todense()\n",
    "\n",
    "        # Inverted tables\n",
    "        ## events attended by each user\n",
    "        self.itemsForUser = self._load_pickle(\"PE_eventsForUser.pkl\")\n",
    "        ## users attending each event\n",
    "        self.usersForItem = self._load_pickle(\"PE_usersForEvent.pkl\")\n",
    "\n",
    "        # User-user similarity computed from user attributes\n",
    "        self.userSimMatrix = sio.mmread(\"US_userSimMatrix\").todense()\n",
    "\n",
    "        # Event-event similarity computed from event attributes\n",
    "        self.eventPropSim = sio.mmread(\"EV_eventPropSim\").todense()\n",
    "        self.eventContSim = sio.mmread(\"EV_eventContSim\").todense()\n",
    "\n",
    "        # Number of friends of each user (kept exactly as mmread returns it)\n",
    "        self.numFriends = sio.mmread(\"UF_numFriends\")\n",
    "        # Influence on a user of each friend's event-participation score\n",
    "        self.userFriends = sio.mmread(\"UF_userFriends\").todense()\n",
    "\n",
    "        # Popularity of each event\n",
    "        self.eventPopularity = sio.mmread(\"EA_eventPopularity\").todense()\n",
    "\n",
    "        # User-based collaborative filtering results\n",
    "        self.userCF = self._load_pickle(\"userCF.pkl\")\n",
    "\n",
    "        # Event-based collaborative filtering results\n",
    "        self.eventCF = self._load_pickle(\"eventCF.pkl\")\n",
    "\n",
    "        # Model-based collaborative filtering results\n",
    "        self.SVDCF = self._load_pickle(\"SVDCF.pkl\")\n",
    "\n",
    "    def userCF_reco(self):\n",
    "        \"\"\"Return the object loaded from userCF.pkl.\"\"\"\n",
    "        return self.userCF\n",
    "\n",
    "    def eventCF_reco(self):\n",
    "        \"\"\"Return the object loaded from eventCF.pkl.\"\"\"\n",
    "        return self.eventCF\n",
    "\n",
    "    def SVDCF_reco(self):\n",
    "        \"\"\"Return the object loaded from SVDCF.pkl.\"\"\"\n",
    "        return self.SVDCF\n",
    "\n",
    "    def userPop(self, userId):\n",
    "        \"\"\"Estimate how social a user is from their number of friends.\n",
    "\n",
    "        The idea is that a user with many friends may be more inclined to\n",
    "        take part in social events.\n",
    "\n",
    "        Returns numFriends[0, i] for the user's index i, or 0 when the user\n",
    "        is unknown or the index is out of range.\n",
    "        \"\"\"\n",
    "        # Unknown users previously fell through and returned None because\n",
    "        # the fallback was attached to the try block instead of this check.\n",
    "        if userId not in self.userIndex:\n",
    "            return 0\n",
    "        i = self.userIndex[userId]\n",
    "        try:\n",
    "            return self.numFriends[0, i]\n",
    "        except IndexError:\n",
    "            return 0\n",
    "\n",
    "    def friendInfluence(self, userId):\n",
    "        \"\"\"Influence of a user's friends on that user.\n",
    "\n",
    "        Considers how many of the user's friends are keen on attending\n",
    "        events; an active circle of friends may sway the user.\n",
    "\n",
    "        Returns the sum of row i of userFriends divided by its column count.\n",
    "        Raises KeyError when userId is not in userIndex.\n",
    "        \"\"\"\n",
    "        nusers = np.shape(self.userFriends)[1]\n",
    "        i = self.userIndex[userId]\n",
    "        # Sum the whole row. The previous `.sum(axis=0)[0, 0]` summed a\n",
    "        # single row along axis 0 (leaving it unchanged) and then returned\n",
    "        # only its first column.\n",
    "        return self.userFriends[i, :].sum() / nusers\n",
    "\n",
    "    def eventPop(self, eventId):\n",
    "        \"\"\"Popularity of the event itself, read from eventPopularity[i, 0].\n",
    "\n",
    "        Popularity is mainly defined by the number of participants.\n",
    "        Raises KeyError when eventId is not in eventIndex.\n",
    "        \"\"\"\n",
    "        i = self.eventIndex[eventId]\n",
    "        return self.eventPopularity[i, 0]\n",
    "\n",
    "    def userReco(self, userId, eventId):\n",
    "        \"\"\"User-based style score where similarity comes from user attributes.\n",
    "\n",
    "        Pseudo-code of the idea:\n",
    "        for item i\n",
    "          for every other user v that has a preference for i\n",
    "            compute similarity s between u and v\n",
    "            incorporate v's preference for i weighted by s into running average\n",
    "        return top items ranked by weighted average\n",
    "\n",
    "        Returns the similarity-weighted sum of all users' scores for the\n",
    "        event minus the user's own score, or 0 on IndexError.\n",
    "        \"\"\"\n",
    "        i = self.userIndex[userId]\n",
    "        j = self.eventIndex[eventId]\n",
    "\n",
    "        vs = self.userEventScores[:, j]\n",
    "        sims = self.userSimMatrix[i, :]\n",
    "\n",
    "        # assumes both operands are numpy.matrix (as .todense() yields for\n",
    "        # scipy sparse input), so * is a matrix product - TODO confirm\n",
    "        prod = sims * vs\n",
    "\n",
    "        try:\n",
    "            return prod[0, 0] - self.userEventScores[i, j]\n",
    "        except IndexError:\n",
    "            return 0\n",
    "\n",
    "    def eventReco(self, userId, eventId):\n",
    "        \"\"\"Item-based style scores where similarity comes from event attributes.\n",
    "\n",
    "        Pseudo-code of the idea:\n",
    "        for item i\n",
    "          for every item j that u has a preference for\n",
    "            compute similarity s between i and j\n",
    "            add u's preference for j weighted by s to a running average\n",
    "        return top items, ranked by weighted average\n",
    "\n",
    "        Returns a (pscore, cscore) tuple built from eventPropSim and\n",
    "        eventContSim respectively; each part stays 0 on IndexError.\n",
    "        \"\"\"\n",
    "        i = self.userIndex[userId]\n",
    "        j = self.eventIndex[eventId]\n",
    "        js = self.userEventScores[i, :]\n",
    "        psim = self.eventPropSim[:, j]\n",
    "        csim = self.eventContSim[:, j]\n",
    "        # assumes numpy.matrix operands, so * is a matrix product - TODO confirm\n",
    "        pprod = js * psim\n",
    "        cprod = js * csim\n",
    "\n",
    "        pscore = 0\n",
    "        cscore = 0\n",
    "        try:\n",
    "            pscore = pprod[0, 0] - self.userEventScores[i, j]\n",
    "        except IndexError:\n",
    "            # deliberate best effort: keep the default of 0\n",
    "            pass\n",
    "        try:\n",
    "            cscore = cprod[0, 0] - self.userEventScores[i, j]\n",
    "        except IndexError:\n",
    "            # deliberate best effort: keep the default of 0\n",
    "            pass\n",
    "        return pscore, cscore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generateRSData(RS, train=True, header=True):\n",
    "    \"\"\"Combine all recommendation signals into one feature CSV for a classifier.\n",
    "\n",
    "    Reads `train.csv` (or `test.csv` when `train` is false) from the working\n",
    "    directory and writes `RS_train.csv` / `RS_test.csv` next to it. Each\n",
    "    output row holds the invited flag, the three cached CF values for that\n",
    "    row, and the per-pair / per-id features computed by `RS`.\n",
    "\n",
    "    Args:\n",
    "        RS: object exposing userCF_reco, eventCF_reco, SVDCF_reco, userReco,\n",
    "            eventReco, userPop, friendInfluence and eventPop.\n",
    "        train: pick the training file and also copy input columns 4 and 5\n",
    "            (interested, not_interested) to the output.\n",
    "        header: write a column-name row to the output first.\n",
    "\n",
    "    The cached CF sequences are indexed by data-row position (0-based), so\n",
    "    they must have one entry per input data row.\n",
    "    \"\"\"\n",
    "    userCF = RS.userCF_reco()\n",
    "    eventCF = RS.eventCF_reco()\n",
    "    svdCF = RS.SVDCF_reco()\n",
    "\n",
    "    fn = \"train.csv\" if train else \"test.csv\"\n",
    "\n",
    "    # Context managers close both files even when a row raises; the input\n",
    "    # is only read, so it is opened read-only.\n",
    "    with open(fn, 'r') as fin, open(\"RS_\" + fn, 'w') as fout:\n",
    "        # Skip the first input line (column names)\n",
    "        fin.readline()\n",
    "\n",
    "        # write output header\n",
    "        if header:\n",
    "            ocolnames = [\"invited\", \"userCF\", \"evtCF\", \"svdCF\", \"user_reco\", \"evt_p_reco\",\n",
    "                         \"evt_c_reco\", \"user_pop\", \"frnd_infl\", \"evt_pop\"]\n",
    "            if train:\n",
    "                ocolnames.append(\"interested\")\n",
    "                ocolnames.append(\"not_interested\")\n",
    "            fout.write(\",\".join(ocolnames) + \"\\n\")\n",
    "\n",
    "        for ln, line in enumerate(fin, start=1):\n",
    "            cols = line.strip().split(\",\")\n",
    "            userId = cols[0]\n",
    "            eventId = cols[1]\n",
    "            invited = cols[2]\n",
    "\n",
    "            # Report progress only after parsing, so the ids shown belong to\n",
    "            # row `ln` rather than to the previous row.\n",
    "            if ln % 500 == 0:\n",
    "                print (\"%s:%d (userId, eventId)=(%s, %s)\" % (fn, ln, userId, eventId))\n",
    "\n",
    "            userCF_reco = userCF[ln - 1]\n",
    "            eventCF_reco = eventCF[ln - 1]\n",
    "            SVDCF_reco = svdCF[ln - 1]\n",
    "\n",
    "            user_reco = RS.userReco(userId, eventId)\n",
    "            evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)\n",
    "            user_pop = RS.userPop(userId)\n",
    "\n",
    "            frnd_infl = RS.friendInfluence(userId)\n",
    "            evt_pop = RS.eventPop(eventId)\n",
    "            ocols = [invited, userCF_reco, eventCF_reco, SVDCF_reco, user_reco,\n",
    "                     evt_p_reco, evt_c_reco, user_pop, frnd_infl, evt_pop]\n",
    "\n",
    "            if train:\n",
    "                ocols.append(cols[4])  # interested\n",
    "                ocols.append(cols[5])  # not_interested\n",
    "            fout.write(\",\".join(map(str, ocols)) + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "生成训练数据...\n",
      "\n",
      "train.csv:500 (userId, eventId)=(123290209, 1887085024)\n",
      "train.csv:1000 (userId, eventId)=(272886293, 199858305)\n",
      "train.csv:1500 (userId, eventId)=(395305791, 1582270949)\n",
      "train.csv:2000 (userId, eventId)=(527523423, 3272728211)\n",
      "train.csv:2500 (userId, eventId)=(651258472, 792632006)\n",
      "train.csv:3000 (userId, eventId)=(811791433, 524756826)\n",
      "train.csv:3500 (userId, eventId)=(985547042, 1269035551)\n",
      "train.csv:4000 (userId, eventId)=(1107615001, 173949238)\n",
      "train.csv:4500 (userId, eventId)=(1236336671, 3849306291)\n",
      "train.csv:5000 (userId, eventId)=(1414301782, 2652356640)\n",
      "train.csv:5500 (userId, eventId)=(1595465532, 955398943)\n",
      "train.csv:6000 (userId, eventId)=(1747091728, 2131379889)\n",
      "train.csv:6500 (userId, eventId)=(1914182220, 955398943)\n",
      "train.csv:7000 (userId, eventId)=(2071842684, 1076364848)\n",
      "train.csv:7500 (userId, eventId)=(2217853337, 3051438735)\n",
      "train.csv:8000 (userId, eventId)=(2338481531, 2525447278)\n",
      "train.csv:8500 (userId, eventId)=(2489551967, 520657921)\n",
      "train.csv:9000 (userId, eventId)=(2650493630, 87962584)\n",
      "train.csv:9500 (userId, eventId)=(2791418962, 4223848259)\n",
      "train.csv:10000 (userId, eventId)=(2903662804, 2791462807)\n",
      "train.csv:10500 (userId, eventId)=(3036141956, 3929507420)\n",
      "train.csv:11000 (userId, eventId)=(3176074542, 3459485614)\n",
      "train.csv:11500 (userId, eventId)=(3285425249, 2271782630)\n",
      "train.csv:12000 (userId, eventId)=(3410667855, 1063772489)\n",
      "train.csv:12500 (userId, eventId)=(3531604778, 2584839423)\n",
      "train.csv:13000 (userId, eventId)=(3686871863, 53495098)\n",
      "train.csv:13500 (userId, eventId)=(3833637800, 2415873572)\n",
      "train.csv:14000 (userId, eventId)=(3944021305, 2096772901)\n",
      "train.csv:14500 (userId, eventId)=(4075466480, 3567240505)\n",
      "train.csv:15000 (userId, eventId)=(4197193550, 1628057176)\n",
      "生成预测数据...\n",
      "\n",
      "test.csv:500 (userId, eventId)=(182290053, 2529072432)\n",
      "test.csv:1000 (userId, eventId)=(433510318, 4244463632)\n",
      "test.csv:1500 (userId, eventId)=(632808865, 2845303452)\n",
      "test.csv:2000 (userId, eventId)=(813611885, 2036538169)\n",
      "test.csv:2500 (userId, eventId)=(1010701404, 303459881)\n",
      "test.csv:3000 (userId, eventId)=(1210932037, 2529072432)\n",
      "test.csv:3500 (userId, eventId)=(1452921099, 2705317682)\n",
      "test.csv:4000 (userId, eventId)=(1623287180, 1626678328)\n",
      "test.csv:4500 (userId, eventId)=(1855201342, 2603032829)\n",
      "test.csv:5000 (userId, eventId)=(2083900381, 2529072432)\n",
      "test.csv:5500 (userId, eventId)=(2318415276, 2509151803)\n",
      "test.csv:6000 (userId, eventId)=(2528161539, 4025975316)\n",
      "test.csv:6500 (userId, eventId)=(2749110768, 4244406355)\n",
      "test.csv:7000 (userId, eventId)=(2927772127, 1532377761)\n",
      "test.csv:7500 (userId, eventId)=(3199685636, 1776393554)\n",
      "test.csv:8000 (userId, eventId)=(3393388475, 680270887)\n",
      "test.csv:8500 (userId, eventId)=(3601169721, 154434302)\n",
      "test.csv:9000 (userId, eventId)=(3828963415, 3067222491)\n",
      "test.csv:9500 (userId, eventId)=(4018723397, 2522610844)\n",
      "test.csv:10000 (userId, eventId)=(4180064266, 2658555390)\n"
     ]
    }
   ],
   "source": [
    "RS = RecommonderSystem()\n",
    "\n",
    "# Build the training features first, then the prediction features.\n",
    "for banner, is_train in ((\"生成训练数据...\\n\", True), (\"生成预测数据...\\n\", False)):\n",
    "    print(banner)\n",
    "    generateRSData(RS, train=is_train, header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "老师提供的代码是 Python 2 版本，逐处修改太麻烦，所以这里重新写了一遍。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
